package com.ulewo.utils;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.Iterator;
import java.util.List;

import com.mysql.jdbc.log.Log;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;

import com.ulewo.po.enums.DateTimePatternEnum;
import com.ulewo.po.model.Spider;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Scraping helpers for oschina.net news.
 *
 * <p>{@link #getOSCNewsList()} reads the list view on the page at {@code URL_SPIDER} and
 * returns the title and absolute URL of every entry whose link contains {@code /news}.
 * {@link #getDetail(String)} fetches one article page, strips the ad and thumbnail nodes,
 * copies every remote image to the local image folder and returns the article body HTML
 * with the images rewritten for lazy loading.
 *
 * @author luo.hl
 * @version 0.1.0
 */
public class SpiderUtil {

    static final Logger logger = LoggerFactory.getLogger(SpiderUtil.class);

    private static final String URL_SPIDER = "http://m.oschina.net";

    private static final String URL_MAIN_DOMAIN = "http://www.oschina.net";

    /** Extensions recognised by {@link #getFileType(String)}; matching is case-sensitive. */
    private static final String[] IMAGE_EXTENSIONS = {
        ".gif", ".png", ".jpg", ".jpeg", ".bmp", ".GIF", ".PNG", ".JPG", ".JPEG", ".BMP"
    };

    /** Extension returned when none of {@link #IMAGE_EXTENSIONS} matches. */
    private static final String DEFAULT_IMAGE_EXTENSION = ".jpg";

    /** Size of the buffer used when copying a downloaded image to disk. */
    private static final int COPY_BUFFER_SIZE = 8192;

    /** Pause between two image downloads, in milliseconds. */
    private static final long DOWNLOAD_PAUSE_MILLIS = 1000L;

    /**
     * Fetches the page at {@code URL_SPIDER} and extracts the news entries of its list view.
     *
     * <p>Only list items whose link contains {@code /news} are returned; their URL is the
     * link prefixed with {@code URL_MAIN_DOMAIN}. A list item that cannot be parsed is
     * logged and skipped.
     *
     * @return the news entries found, possibly empty (also when the page has no list view);
     *         never {@code null}
     * @throws ClientProtocolException declared for the underlying page fetch
     * @throws IOException declared for the underlying page fetch
     */
    public static List<Spider> getOSCNewsList() throws ClientProtocolException, IOException {
        List<Spider> listSpider = new ArrayList<Spider>();
        String html = NetUtils.getUrl(URL_SPIDER, null);
        HtmlCleaner htmlCleaner = new HtmlCleaner();
        TagNode allNode = htmlCleaner.clean(html);
        TagNode[] listNodes = allNode.getElementsByAttValue("data-role", "listview", true, true);
        if (listNodes == null || listNodes.length == 0) {
            // The page layout changed or the fetch returned something unexpected.
            logger.warn("No list view found on " + URL_SPIDER);
            return listSpider;
        }
        TagNode[] liTags = listNodes[0].getElementsByName("li", true);
        for (TagNode litag : liTags) {
            try {
                String linkUrl = litag.getElementsByAttValue("class", "ui-link-inherit", true, true)[0]
                        .getAttributeByName("href");
                String title = litag.getElementsByAttValue("class", "ui-li-heading", true, true)[0]
                        .getText().toString();
                if (linkUrl != null && linkUrl.contains("/news")) {
                    Spider spider = new Spider();
                    spider.setTitle(title);
                    spider.setUrl(URL_MAIN_DOMAIN + linkUrl);
                    listSpider.add(spider);
                }
            } catch (Exception e) {
                // One malformed item must not abort the whole list.
                logger.error("抓取新闻异常", e);
            }
        }
        return listSpider;
    }

    /**
     * Fetches an article page and returns the inner HTML of its content node.
     *
     * <p>Before the HTML is returned, the ad block and the first thumbnail are removed
     * from the document and every image inside the content node is either copied to the
     * local image store (and rewritten for lazy loading) or removed.
     *
     * @param url address of the article page
     * @return the trimmed inner HTML of the article content node
     * @throws ClientProtocolException declared for the underlying page fetch
     * @throws IOException if the page cannot be fetched or contains no content node
     */
    public static String getDetail(String url) throws ClientProtocolException, IOException {
        String html = NetUtils.getUrl(url, null);
        HtmlCleaner htmlCleaner = new HtmlCleaner();
        TagNode allNode = htmlCleaner.clean(html);
        TagNode[] contentNodes = allNode.getElementsByAttValue("class", "editor-viewer text clear", true, true);
        if (contentNodes == null || contentNodes.length == 0) {
            throw new IOException("Article content node not found at " + url);
        }
        TagNode contentNode = contentNodes[0];

        removeAdAndThumbnail(allNode);
        localizeImages(contentNode);

        String content = htmlCleaner.getInnerHtml(contentNode);
        return content.trim();
    }

    /** Removes the ad block (via its parent node) and the first thumbnail node, if present. */
    private static void removeAdAndThumbnail(TagNode allNode) {
        // Remove the ad: the marked node's parent is the element taken out of the tree.
        TagNode[] adTags = allNode.getElementsByAttValue("data-traceid", "news_detail_above_text_link_1", true, true);
        if (null != adTags && adTags.length > 0) {
            TagNode adParent = adTags[0].getParent();
            if (adParent != null) {
                adParent.removeFromTree();
            }
        }

        // Remove the thumbnail.
        TagNode[] thumbTags = allNode.getElementsByAttValue("class", "thumb", true, true);
        if (null != thumbTags && thumbTags.length > 0) {
            thumbTags[0].removeFromTree();
        }
    }

    /**
     * Copies every remote image below {@code contentNode} to the local image store and
     * rewrites the tag to {@code class="lazy-load"} with the new address in
     * {@code data-original}. Images that have no usable source, or whose download fails,
     * are removed from the tree.
     */
    private static void localizeImages(TagNode contentNode) {
        TagNode[] images = contentNode.getElementsByName("img", true);
        for (TagNode tag : images) {
            String src = null;
            try {
                tag.removeAttribute("alt");
                src = tag.getAttributeByName("src");
                // A source containing "https" also contains "http", so one test suffices.
                if (src == null || !src.contains("http")) {
                    tag.removeFromTree();
                    continue;
                }
                String newSrc = uploadImage(src);
                // NOTE(review): the "emotion" filter is applied to the generated local URL,
                // not to the original source - confirm this is the intended target.
                if (null != newSrc && !newSrc.contains("emotion")) {
                    tag.addAttribute("data-original", newSrc);
                    tag.removeAttribute("src");
                    tag.addAttribute("class", "lazy-load");
                } else {
                    tag.removeFromTree();
                }
                // Throttle consecutive downloads.
                Thread.sleep(DOWNLOAD_PAUSE_MILLIS);
            } catch (InterruptedException e) {
                // Preserve the interrupt for the caller and stop downloading further images.
                Thread.currentThread().interrupt();
                logger.warn("Interrupted while processing images; remaining images are left untouched");
                break;
            } catch (Exception e) {
                logger.warn("Failed to process image " + src, e);
            }
        }
    }

    /**
     * Downloads the image at {@code oldsrc} into the current month's image folder.
     *
     * <p>Redirects are not followed and any response other than HTTP 200 counts as a
     * failure. On failure a partially written file is deleted.
     *
     * @param oldsrc absolute HTTP(S) address of the image
     * @return the public address of the stored copy, or {@code null} if the download failed
     */
    private static String uploadImage(String oldsrc) {
        String dirPath = getFolder();
        InputStream is = null;
        OutputStream os = null;
        File savetoFile = null;
        String saveName = null;
        boolean saved = false;
        try {
            URL imageUrl = new URL(oldsrc);
            HttpURLConnection conn = (HttpURLConnection) imageUrl.openConnection();
            // Per-connection setting; the static setFollowRedirects would affect the whole JVM.
            conn.setInstanceFollowRedirects(false);
            int status = conn.getResponseCode();
            if (status != HttpURLConnection.HTTP_OK) {
                throw new IOException("Unexpected HTTP status " + status);
            }
            // Use the URL path so a query string does not hide the file extension.
            String type = getFileType(imageUrl.getPath());
            saveName = Long.toString(new Date().getTime()) + "_p" + type;
            savetoFile = new File(ServerUtils.getImageFolder() + dirPath + "/" + saveName);
            is = conn.getInputStream();
            os = new FileOutputStream(savetoFile);
            byte[] buffer = new byte[COPY_BUFFER_SIZE];
            int read;
            while ((read = is.read(buffer)) != -1) {
                os.write(buffer, 0, read);
            }
            os.flush();
            saved = true;
        } catch (Exception e) {
            logger.error("Failed to download image " + oldsrc, e);
        } finally {
            closeQuietly(os);
            closeQuietly(is);
        }
        if (!saved) {
            // Do not leave a truncated image behind.
            if (savetoFile != null && savetoFile.exists() && !savetoFile.delete()) {
                logger.warn("Could not delete partial file " + savetoFile.getAbsolutePath());
            }
            return null;
        }
        return ServerUtils.getImageDomain() + "upload/" + dirPath + "/" + saveName;
    }

    /** Closes {@code resource} if it is not {@code null}; a failure is logged, not thrown. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            logger.warn("Failed to close stream", e);
        }
    }

    /**
     * Returns the image extension that {@code fileName} ends with.
     *
     * <p>Only the all-lowercase and all-uppercase forms of gif, png, jpg, jpeg and bmp are
     * recognised; the match is returned exactly as listed (for example {@code ".PNG"}).
     *
     * @param fileName file name or path to inspect; may be {@code null}
     * @return the matching extension including the leading dot, or {@code ".jpg"} when
     *         nothing matches or {@code fileName} is {@code null}
     */
    public static String getFileType(String fileName) {
        if (fileName == null) {
            return DEFAULT_IMAGE_EXTENSION;
        }
        for (String extension : IMAGE_EXTENSIONS) {
            if (fileName.endsWith(extension)) {
                return extension;
            }
        }
        return DEFAULT_IMAGE_EXTENSION;
    }

    /**
     * Returns the name of the current month's sub-folder and creates that folder below
     * the image folder when it does not exist yet.
     *
     * @return the folder name formatted with the {@code YYYYMM} pattern; returned even if
     *         the directory could not be created
     */
    private static String getFolder() {
        String dirPath = DateUtil.format(new Date(), DateTimePatternEnum.YYYYMM.getPattern());
        File dir = new File(ServerUtils.getImageFolder() + dirPath);
        try {
            // isDirectory() covers the race where another thread created it in between.
            if (!dir.exists() && !dir.mkdirs() && !dir.isDirectory()) {
                logger.warn("Could not create image directory " + dir.getAbsolutePath());
            }
        } catch (SecurityException e) {
            logger.warn("Not allowed to create image directory " + dir.getAbsolutePath(), e);
        }
        return dirPath;
    }

    /**
     * Manual smoke test: fetches one fixed article and prints the extracted HTML.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String html = null;
        try {
            html = getDetail("http://www.oschina.net/news/82609/tinyshop-2-6");
        } catch (IOException e) {
            logger.error("Failed to fetch article detail", e);
        }
        System.out.print(html);
    }
}
